Purpose

This script serves as a playing ground to figure out which variables serve best to build a good model, to predict our dependend variable.

remove(list = ls())
library(reactable)
library(sf)
library(data.table)
library(tidyverse)
library(corrplot)
library(reshape2)
library(igraph)
library(psych)
library(openxlsx)
sample <- readRDS("../data_raw/samples/sample_for_model_building.RDS")
ncol(sample) ### abzügzlich der 5 nicht inhaltlichen Variablen
## [1] 164
## how many colums are there
sample %>% dim()
## [1] 2260  164
sample <- sample %>% as.data.frame()

### delete administrative variables
tmp_variable <- c("i00","idep","iprov","imun","ID_mun","ID_prov")
sample <- sample[,!colnames(sample) %in% tmp_variable]

we have 164 variables, minus the 5 “administrative” ones results in 159 variables

Selection

Filter NAs

filter for variables which have lots of NAs

dat_table_nas <- tibble(
  "colnames" = colnames(sample),
  "pecentageNAs" = sapply(
    sample,
    FUN = function(x)
      round(mean(is.na(x)), 3)
  )
)

hist(dat_table_nas$pecentageNAs,breaks = 40)

reactable(dat_table_nas,filterable = T,compact = T,defaultPageSize = 15)

because we need values to actually contain information we will only take a look at the variables, that have at least 50% not NA

### leads to deletion of 
(dat_table_nas$pecentageNAs > .5) %>% sum()
## [1] 41
## how many 
dat_selection <- dat_table_nas %>% filter(pecentageNAs < .5)

Model With Single Predictor

In the next step i will run just a single model to get a rough idea of how well two variables are connected and could be used for prediction

Because of simplicity I will just ran a numeric and factorial model for each variable. I will exclude the variables which have way to many levels (over 500). This loop either jumps the variable, if the variable only has 1 level or over 500.

list_variables <- dat_selection$colnames

dat_ko_variables <- data_frame(
  "variable" = character(),
  "r_squared_num" = character(),
  "p_value_num" = character(),
  "r_squared_factor" = character(),
  "p_value_factor" = character(),
  "n_fac_lelves" = character(),
  "min" = character(),
  "qnt_1st" = character(),
  "median" = character(),
  "mean" = character(),
  "qnt_3rd" = character(),
  "max" = character(),
  "NA_percentage" = character()
)
## Warning: `data_frame()` was deprecated in tibble 1.1.0.
## ℹ Please use `tibble()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
for (i in 1:length(list_variables)) {
  tmp_object <- list_variables[i]
  tmp_variable_num <- sample[[tmp_object]]
  tmp_variable_fac <- tmp_variable_num %>% as.factor()
  tmp_length_variable <- table(tmp_variable_fac) %>% length()
  
  print(paste0("now doing number: ", i))
  
  ### dont run model if more then 15 factor
  if (tmp_length_variable > 500) {
    tmp_data <- data_frame(
      "variable" = tmp_object,
      "r_squared_num" = NA,
      "p_value_num" = NA,
      "r_squared_factor" = NA,
      "p_value_factor" = NA,
      "n_fac_lelves" = tmp_length_variable,
      "min" = summary(tmp_variable_num)[1],
      "qnt_1st" = summary(tmp_variable_num)[2],
      "median" = summary(tmp_variable_num)[3],
      "mean" = summary(tmp_variable_num)[4],
      "qnt_3rd" = summary(tmp_variable_num)[5],
      "max" = summary(tmp_variable_num)[6],
      "NA_percentage" = mean(is.na(tmp_variable_num))
    )
    
    dat_ko_variables <- rbind(dat_ko_variables, tmp_data)
    print("early exit, too many")
    next
    
  } else if (tmp_length_variable <= 1) {
    tmp_data <- data_frame(
      "variable" = tmp_object,
      "r_squared_num" = NA,
      "p_value_num" = NA,
      "r_squared_factor" = NA,
      "p_value_factor" = NA,
      "n_fac_lelves" = tmp_length_variable,
      "min" = summary(tmp_variable_num)[1],
      "qnt_1st" = summary(tmp_variable_num)[2],
      "median" = summary(tmp_variable_num)[3],
      "mean" = summary(tmp_variable_num)[4],
      "qnt_3rd" = summary(tmp_variable_num)[5],
      "max" = summary(tmp_variable_num)[6],
      "NA_percentage" = mean(is.na(tmp_variable_num))
    )
    
    dat_ko_variables <- rbind(dat_ko_variables, tmp_data)
    print("early exit, one or less")
    next
    
  } else {
    formula <- as.formula(paste("aestudio ~", tmp_object))
    model_num <- lm(sample$aestudio ~ tmp_variable_num)
    model_fac <- lm(sample$aestudio ~ tmp_variable_fac)
    
    r_squared_num <- summary(model_num)$r.squared
    p_value_num <- summary(model_num)$coefficients[1, 4]
    
    r_squared_factor <- summary(model_fac)$r.squared
    p_value_factor <- summary(model_fac)$coefficients[1, 4]

    tmp_data <- data_frame(
      "variable" = tmp_object,
      "r_squared_num" = r_squared_num,
      "p_value_num" = p_value_num,
      "r_squared_factor" = r_squared_factor,
      "p_value_factor" = p_value_factor,
      "n_fac_lelves" = tmp_length_variable,
      "min" = summary(tmp_variable_num)[1],
      "qnt_1st" = summary(tmp_variable_num)[2],
      "median" = summary(tmp_variable_num)[3],
      "mean" = summary(tmp_variable_num)[4],
      "qnt_3rd" = summary(tmp_variable_num)[5],
      "max" = summary(tmp_variable_num)[6],
      "NA_percentage" = mean(is.na(tmp_variable_num))
    )
    dat_ko_variables <- rbind(dat_ko_variables, tmp_data)
  }
}
## [1] "now doing number: 1"
## [1] "now doing number: 2"
## [1] "now doing number: 3"
## [1] "now doing number: 4"
## [1] "now doing number: 5"
## [1] "now doing number: 6"
## [1] "now doing number: 7"
## [1] "now doing number: 8"
## [1] "now doing number: 9"
## [1] "now doing number: 10"
## [1] "now doing number: 11"
## [1] "now doing number: 12"
## [1] "now doing number: 13"
## [1] "now doing number: 14"
## [1] "now doing number: 15"
## [1] "now doing number: 16"
## [1] "now doing number: 17"
## [1] "now doing number: 18"
## [1] "now doing number: 19"
## [1] "now doing number: 20"
## [1] "now doing number: 21"
## [1] "now doing number: 22"
## [1] "now doing number: 23"
## [1] "now doing number: 24"
## [1] "now doing number: 25"
## [1] "now doing number: 26"
## [1] "now doing number: 27"
## [1] "now doing number: 28"
## [1] "now doing number: 29"
## [1] "now doing number: 30"
## [1] "now doing number: 31"
## [1] "now doing number: 32"
## [1] "now doing number: 33"
## [1] "now doing number: 34"
## [1] "now doing number: 35"
## [1] "now doing number: 36"
## [1] "now doing number: 37"
## [1] "now doing number: 38"
## [1] "now doing number: 39"
## [1] "now doing number: 40"
## [1] "now doing number: 41"
## [1] "now doing number: 42"
## [1] "now doing number: 43"
## [1] "now doing number: 44"
## [1] "now doing number: 45"
## [1] "now doing number: 46"
## [1] "now doing number: 47"
## [1] "now doing number: 48"
## Warning in summary.lm(model_num): essentially perfect fit: summary may be
## unreliable
## Warning in summary.lm(model_num): essentially perfect fit: summary may be
## unreliable
## [1] "now doing number: 49"
## [1] "now doing number: 50"
## [1] "now doing number: 51"
## [1] "now doing number: 52"
## [1] "now doing number: 53"
## [1] "now doing number: 54"
## [1] "now doing number: 55"
## [1] "now doing number: 56"
## [1] "now doing number: 57"
## [1] "now doing number: 58"
## [1] "now doing number: 59"
## [1] "now doing number: 60"
## [1] "now doing number: 61"
## [1] "now doing number: 62"
## [1] "now doing number: 63"
## [1] "now doing number: 64"
## [1] "now doing number: 65"
## [1] "now doing number: 66"
## [1] "now doing number: 67"
## [1] "now doing number: 68"
## [1] "now doing number: 69"
## [1] "now doing number: 70"
## [1] "now doing number: 71"
## [1] "now doing number: 72"
## [1] "now doing number: 73"
## [1] "now doing number: 74"
## [1] "now doing number: 75"
## [1] "now doing number: 76"
## [1] "now doing number: 77"
## [1] "now doing number: 78"
## [1] "early exit, one or less"
## [1] "now doing number: 79"
## [1] "now doing number: 80"
## [1] "now doing number: 81"
## [1] "now doing number: 82"
## [1] "now doing number: 83"
## [1] "now doing number: 84"
## [1] "now doing number: 85"
## [1] "now doing number: 86"
## [1] "now doing number: 87"
## [1] "now doing number: 88"
## [1] "now doing number: 89"
## [1] "now doing number: 90"
## [1] "now doing number: 91"
## [1] "now doing number: 92"
## [1] "now doing number: 93"
## [1] "now doing number: 94"
## [1] "now doing number: 95"
## [1] "now doing number: 96"
## [1] "now doing number: 97"
## [1] "now doing number: 98"
## [1] "now doing number: 99"
## [1] "now doing number: 100"
## [1] "now doing number: 101"
## [1] "now doing number: 102"
## [1] "now doing number: 103"
## [1] "now doing number: 104"
## [1] "now doing number: 105"
## [1] "now doing number: 106"
## [1] "now doing number: 107"
## [1] "now doing number: 108"
## [1] "now doing number: 109"
## [1] "now doing number: 110"
## [1] "now doing number: 111"
## [1] "now doing number: 112"
## [1] "now doing number: 113"
## [1] "now doing number: 114"
## [1] "now doing number: 115"
## [1] "now doing number: 116"
## [1] "now doing number: 117"
## convert column to numeric
dat_ko_variables$mean <- dat_ko_variables$mean %>% as.numeric() 

### round variables to 3 digits after 0 
dat_ko_variables <- dat_ko_variables %>%
  mutate(across(where(is.numeric), ~ round(.x, 3)))

dat_ko_variables %>% reactable(.,compact = T, filterable = T,defaultPageSize = 20)
dat_ko_variables <- dat_ko_variables[order(dat_ko_variables$r_squared_factor,decreasing = T),]

save as excel

For exploratory purposes I will export the data frame as an excel.
An older version of this exists which was used to select the variables, it has the suffix _filled.

write.xlsx(dat_ko_variables,"../data_raw/2024_census/processed/dat_ko_variables_selection.xlsx")

combined model

## remove aestudio
dat_ko_variables_selection <- dat_ko_variables %>% dplyr::filter(variable != "aestudio")

### remove variables with high fac levels
(dat_ko_variables_selection$n_fac_lelves >= 15) %>% sum()
## [1] 24
dat_ko_variables_selection <- dat_ko_variables_selection[dat_ko_variables_selection$n_fac_lelves < 15,]

# ### remove variables with high NA-ratio
# (dat_ko_variables_selection$NA_percentage > .25) %>% sum()
# dat_ko_variables_selection <- dat_ko_variables_selection[dat_ko_variables_selection$NA_percentage < .25,]

### remove variables, which are too close conceptually
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in%  c("p41a_nivel","p41b_curso","p41a_nivel_act","p41b_curso_act","nivel_edu","p38_asiste", "asiste"))

Cramers V

sample$aestudio %>% is.na() %>% table()
## .
## FALSE 
##  2260
sample <- sample %>% as.data.frame()


### filter with factors over 15
idx <- sapply(sample,FUN = function(x) length(unique(x)) > 15)
idx %>% table()
## .
## FALSE  TRUE 
##   126    32
sample2 <- sample[, !idx]

### filter with na percentage higher then .5
idx <- sapply(sample2,FUN = function(x) mean(is.na(x)) > .5)
idx %>% table()
## .
## FALSE  TRUE 
##    93    33
sample2 <-  sample2[, !idx]

### other non informative variables filter 
sample2 <- sample2[,!colnames(sample2) %in% c("idep","imun")]

sample_fast <- data.frame(lapply(sample2, function(x) {
  if (is.factor(x)) as.numeric(x) else x
}))

sample2
library(vcd)
## Loading required package: grid
tmp_data <- matrix(ncol = 3,nrow = 0) %>% as.data.frame()
colnames(tmp_data) <- c("v1","v2","cramresV")

for(i in seq_along(sample2)){
  for(j in seq_along(sample2)){
  x1 <- sample2[,i] %>% as.factor()
  x2 <- sample2[,j] %>% as.factor()
  tbl <- table(x1, x2)
  tmp_vec <- c(colnames(sample2)[i],colnames(sample2)[j],assocstats(tbl)$cramer[[1]]) %>% unlist()
  tmp_data <- rbind(tmp_data,tmp_vec)
    
  }
}

colnames(tmp_data) <- c("v1","v2","cramresV")

Histogram

tmp_data$cramresV <- tmp_data$cramresV %>% as.numeric
tmp_data$cramresV %>% hist()

This is a histogramm showing the distribution of the correlation between all variables.

Correlation Matrix

cram_mat <- tmp_data %>%
  filter(v1 != v2) %>%                  # remove self-pairs
  pivot_wider(
    names_from  = v2,
    values_from = cramresV
  ) %>%
  column_to_rownames("v1") %>%
  as.matrix()

dim(cram_mat)
## [1] 93 93
corrplot(
  cram_mat,
  method = "color",
  type   = "lower",
  tl.cex = 0.6
)

cram_long <- melt(cram_mat)

corr_matrix_2 <-  ggplot(cram_long, aes(Var1, Var2, fill = value)) +
  geom_tile() +
  scale_fill_viridis_c() +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, size = 6),
    axis.text.y = element_text(size = 6)
  ) +
  labs(fill = "Cramér's V")

corr_matrix_2

ggsave("../output/corr_matrix-2.png",plot = corr_matrix_2,width = 10,height = 6, device = "png",dpi = 300)
dat_selection <- tmp_data

### drop the pair with itself
dat_selection <- dat_selection[dat_selection$v1 != dat_selection$v2,]
dat_selection <- dat_selection[!is.na(dat_selection$cramresV),]
dat_selection$cramresV %>% hist()

dat_selection %>% reactable(.,compact = T,filterable = T)
### load in list from. A
list_variables <- readRDS("../data_raw/misc/list_variables_seleciton.RDS")

### filter for only these variables
dat_selection <- dat_selection[dat_selection$v1 %in% list_variables,]

dat_selection$pair_sorted <- apply(dat_selection[, c("v1","v2")], 1, function(x) paste(sort(x), collapse = "_"))
dat_selection_unique <- dat_selection[!duplicated(dat_selection$pair_sorted), ]

dat_selection_unique[,1:3] %>% reactable(.,compact = T,filterable = T)
# dat_selection_unique: v1, v2, cramresV
threshold <- 0.75

# keep only strong pairs
strong_pairs <- dat_selection_unique %>% filter(cramresV > threshold)

# build graph
g <- graph_from_data_frame(strong_pairs[, c("v1", "v2")], directed = FALSE)

# find connected components
comp <- components(g)

# create a dataframe with groups
group_df <- data.frame(
  var = names(comp$membership),
  group = comp$membership
)

# optional: collapse variables in each group into one row
grouped_vars <- group_df %>%
  group_by(group) %>%
  summarise(vars = paste(var, collapse = ", "), .groups = "drop")

reactable(grouped_vars)
# how many covariats currently in there
dat_ko_variables_selection$variable %>% length()
## [1] 85
dat_ko_variables_selection$variable
##  [1] "p49_ocu_1d"     "ocu_1d_19"      "ocu_1d_13"      "p40_lee"       
##  [5] "gedadedu"       "g_edad_bol"     "v19c_compu"     "p42c_camina"   
##  [9] "g_edad"         "p50_catocu_13"  "p53_ecivil"     "p42d_comuni"   
## [13] "p50_semp"       "v10_combus"     "v11_basura"     "v06_piso"      
## [17] "v19e_inetfijo"  "v16_desague"    "v18j_lavadora"  "v19e_f"        
## [21] "p42b_oir"       "p31_afiliado"   "p45_agro"       "urbrur"        
## [25] "v19b_tv"        "v19f_inetmovil" "p30b_caja"      "v18f_refri"    
## [29] "p42_discap"     "p42a_ver"       "v03_pared"      "p43_pago"      
## [33] "v18g_micro"     "v15_servsan"    "p32_pueblo_per" "v08_aguadist"  
## [37] "p52_mov"        "v19g_tvcable"   "v04_revoq"      "v17_tenencia"  
## [41] "v18c_auto"      "v09_energia"    "v19d_celular"   "v19h_d"        
## [45] "dep_nac_cod"    "dep_lab_cod"    "v14_dormit"     "v18i_aire"     
## [49] "dep_res_cod"    "v07_aguapro"    "dep_res5_cod"   "condact_19"    
## [53] "v05_techo"      "v18a_bici"      "v18h_calefon"   "tip_hog"       
## [57] "p37_lugres5"    "p25_sexo"       "p30c_privad"    "v13_habitac"   
## [61] "v18b_moto"      "v19a_radio"     "v19h_telfijo"   "p30d_atedom"   
## [65] "p36_lugres"     "v20a_emi"       "p30f_autome"    "p30e_tradic"   
## [69] "p35_lugnac"     "p44_nego"       "p29_ci"         "p30g_casera"   
## [73] "v18d_carreta"   "p28_cn"         "p30a_public"    "condact_13"    
## [77] "pea_13"         "v12_cocina"     "v18e_bote"      "v21a_fal"      
## [81] "p31_cobersalud" "pet_19"         "ft_19"          "pet_13"        
## [85] "v02_condocup"
### remove highly correlated items (9 gorups identified in script B)
# highly correlated with p28_cn, 
tmp_variables <- "p29_ci"
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with p30f_autome, 
tmp_variables <- "p30g_casera"
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with edad, 
tmp_variables <- c("g_edad","g_edad_bol","gedadedu")
dat_ko_variables_selection$variable %in% tmp_variables %>% sum()
## [1] 3
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with pet_19, 
tmp_variables <- c("condact_19","pet_13","pea_13","condact_13","ft_19") ## condact_13 already filtered out
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 5
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with pet_19, 
tmp_variables <- c("condact_19","pet_13","pea_13","condact_13","ft_19") ### all of them already filtered out, only pet_19 in 
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 0
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with ocu_1d_13, 
tmp_variables <- c("p49_ocu_1d","ocu_1d_19") ### all already filtered out
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 2
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with p50_catocu_13, 
tmp_variables <- c("p50_semp") ### already filtered out
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 1
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with v18f_refri, 
tmp_variables <- c("v18g_micro","v18j_lavadora","v19c_compu","v19e_inetfijo","v19b_tv","v19g_tvcable")
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 6
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

# highly correlated with v19d_celular, 
tmp_variables <- c("v19f_inetmovil","v19e_f","v19h_d")
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 3
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

### remove because of singularity 
tmp_variables <- c("v02_condocup")
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 1
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

### remove municipality codes 
tmp_variables <- c("dep_res_cod","dep_res5_cod","dep_lab_cod","dep_nac_cod") ### dep_lab_cod already out 
tmp_variables %in% dat_ko_variables_selection$variable %>% sum()
## [1] 4
dat_ko_variables_selection <- dat_ko_variables_selection %>% filter(!variable %in% 
                                                                      tmp_variables)

candidates <- dat_ko_variables_selection[,"variable"]


sample <- sample %>% as.data.frame()
sample_candidates <- sample[,c("aestudio",candidates$variable) %in% colnames(sample)]

formula <- paste("aestudio ~",paste(c("p26_edad",candidates$variable),collapse =  " + " ))
formula <- formula %>% as.formula()

Linear Models

# sample$p332_idiohab2_cod[is.na(sample$p332_idiohab2_cod)] <- "Missing"
### replace all NAs with Missing, so its its own factor
## becase sample_candidates[,candidates$variable] %>% na.omit()
## in a big shrinkage of the data frame
sample_candidates[,candidates$variable][is.na(sample_candidates[,candidates$variable])] <- "Missing"

## convert all of them to factors 
for (col in candidates$variable) {
  sample_candidates[[col]] <- factor(sample_candidates[[col]])
}
lm(formula = formula,data = sample_candidates) %>% summary()
## 
## Call:
## lm(formula = formula, data = sample_candidates)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.8193  -2.3028   0.1004   2.2940  13.2958 
## 
## Coefficients: (43 not defined because of singularities)
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           19.2104219  3.9346049   4.882 1.13e-06 ***
## p26_edad              -0.0944648  0.0067601 -13.974  < 2e-16 ***
## ocu_1d_131            -0.2818353  4.0358810  -0.070 0.944334    
## ocu_1d_132             1.0557710  3.6409707   0.290 0.771868    
## ocu_1d_133            -1.7523047  3.6671352  -0.478 0.632814    
## ocu_1d_134            -2.5010378  3.6929435  -0.677 0.498324    
## ocu_1d_135            -4.4857981  3.6333252  -1.235 0.217109    
## ocu_1d_136            -5.7465548  3.6278550  -1.584 0.113344    
## ocu_1d_137            -4.9174309  3.6302191  -1.355 0.175698    
## ocu_1d_138            -5.2982827  3.6338236  -1.458 0.144979    
## ocu_1d_139            -5.7192420  3.6728639  -1.557 0.119585    
## ocu_1d_1397           -3.7254489  3.6805048  -1.012 0.311555    
## ocu_1d_1399           -6.3276195  3.6890172  -1.715 0.086447 .  
## ocu_1d_13Missing      -4.6322148  3.6710459  -1.262 0.207154    
## p40_lee2              -5.1112426  0.3433251 -14.887  < 2e-16 ***
## p40_lee9              -6.7071898  1.4655773  -4.576 5.01e-06 ***
## p42c_camina2          -0.4000825  0.2900252  -1.379 0.167897    
## p42c_camina3           0.3148241  0.6605354   0.477 0.633683    
## p42c_camina4          -0.8733877  1.2303573  -0.710 0.477868    
## p42c_camina9          -0.5558672  1.4070368  -0.395 0.692837    
## p50_catocu_132        -0.0799209  0.2586223  -0.309 0.757333    
## p50_catocu_133         0.5420167  0.7340109   0.738 0.460336    
## p50_catocu_134         0.3759010  0.3638362   1.033 0.301649    
## p50_catocu_135        -1.3521822  1.0756911  -1.257 0.208882    
## p50_catocu_136        -0.9553811  1.5438101  -0.619 0.536086    
## p50_catocu_139        -0.0533627  0.3138500  -0.170 0.865006    
## p50_catocu_13Missing          NA         NA      NA       NA    
## p53_ecivil2           -0.3706055  0.2258943  -1.641 0.101029    
## p53_ecivil3           -0.2817358  0.5035782  -0.559 0.575903    
## p53_ecivil4            0.9757556  0.6939424   1.406 0.159843    
## p53_ecivil5           -0.1037982  0.3801780  -0.273 0.784861    
## p53_ecivil6            0.4380234  0.2368831   1.849 0.064584 .  
## p53_ecivil9           -0.8644552  0.5480420  -1.577 0.114867    
## p42d_comuni2          -0.7457360  0.2750420  -2.711 0.006756 ** 
## p42d_comuni3          -0.1140062  0.7646217  -0.149 0.881488    
## p42d_comuni4          -0.3861464  1.4554750  -0.265 0.790800    
## p42d_comuni9           0.2877857  1.3638270   0.211 0.832898    
## v10_combus2           -0.1119280  0.3058863  -0.366 0.714467    
## v10_combus3           -0.2860762  0.2284975  -1.252 0.210715    
## v10_combus4           -1.1361343  0.8391952  -1.354 0.175935    
## v10_combus6            1.8756787  3.5616813   0.527 0.598509    
## v10_combus7            4.1596093  2.5741540   1.616 0.106265    
## v10_combus8           -0.5630082  0.9342084  -0.603 0.546802    
## v10_combusMissing     -1.0976314  1.2800020  -0.858 0.391255    
## v11_basura2            0.0110451  0.3076631   0.036 0.971366    
## v11_basura3           -0.8874901  0.5178427  -1.714 0.086711 .  
## v11_basura4           -0.4015246  0.5369580  -0.748 0.454680    
## v11_basura5           -0.3825404  0.3351300  -1.141 0.253806    
## v11_basura6           -0.0370118  0.4191637  -0.088 0.929647    
## v11_basura7            0.6433218  0.8070238   0.797 0.425453    
## v11_basuraMissing             NA         NA      NA       NA    
## v06_piso2              1.5614058  0.5570684   2.803 0.005112 ** 
## v06_piso3              2.0704270  0.6363746   3.253 0.001158 ** 
## v06_piso4              0.6615894  0.3252521   2.034 0.042070 *  
## v06_piso5              0.2450679  0.2378498   1.030 0.302967    
## v06_piso6              0.3346494  0.8144418   0.411 0.681192    
## v06_piso7              1.0352587  0.4493166   2.304 0.021317 *  
## v06_piso8              0.9321456  1.3820962   0.674 0.500105    
## v06_piso9              2.4671876  1.0286138   2.399 0.016548 *  
## v06_pisoMissing               NA         NA      NA       NA    
## v16_desague2           0.4588161  0.3495231   1.313 0.189432    
## v16_desague3          -0.8201033  0.2561343  -3.202 0.001386 ** 
## v16_desague4          -0.7172989  0.5914831  -1.213 0.225378    
## v16_desague5          -1.6533004  0.8462413  -1.954 0.050871 .  
## v16_desague6          -0.8536275  0.7966089  -1.072 0.284035    
## v16_desagueMissing    -0.8706840  0.2985507  -2.916 0.003579 ** 
## p42b_oir2             -0.0014114  0.2873346  -0.005 0.996081    
## p42b_oir3             -0.1241268  0.7151049  -0.174 0.862214    
## p42b_oir4             -0.2764868  1.5749996  -0.176 0.860667    
## p42b_oir9             -0.5553549  1.5909251  -0.349 0.727067    
## p31_afiliado2          0.9077778  0.3437572   2.641 0.008334 ** 
## p31_afiliado3          2.6840713  1.0283822   2.610 0.009119 ** 
## p31_afiliado4         -0.2683604  0.2198853  -1.220 0.222431    
## p31_afiliado9          0.4972290  0.7398978   0.672 0.501643    
## p45_agro2              0.4890574  0.3936882   1.242 0.214286    
## p45_agro9              0.7078342  1.1282783   0.627 0.530494    
## p45_agroMissing        0.5668453  0.3097008   1.830 0.067348 .  
## urbrur2                0.2250764  0.2404200   0.936 0.349289    
## p30b_caja2            -0.7216982  0.2695941  -2.677 0.007487 ** 
## p30b_caja9             1.4770033  4.4005212   0.336 0.737174    
## v18f_refri2           -0.0029766  0.2160107  -0.014 0.989007    
## v18f_refri9           -3.1240721  6.5128212  -0.480 0.631505    
## v18f_refriMissing             NA         NA      NA       NA    
## p42_discap2            0.4957764  0.6329367   0.783 0.433543    
## p42_discapMissing      0.3944657  1.4825118   0.266 0.790205    
## p42a_ver2              0.0377055  0.2131570   0.177 0.859612    
## p42a_ver3             -0.0332640  0.6417041  -0.052 0.958664    
## p42a_ver4              1.4563954  1.1384931   1.279 0.200959    
## p42a_ver9              0.8698975  1.7694169   0.492 0.623033    
## v03_pared2             0.3489045  0.2219958   1.572 0.116179    
## v03_pared3             0.2549566  1.1618921   0.219 0.826335    
## v03_pared4             0.1664070  0.5472914   0.304 0.761116    
## v03_pared5            -0.0582422  0.3774089  -0.154 0.877372    
## v03_pared6             1.7531384  0.9774126   1.794 0.073014 .  
## v03_pared7             0.9747964  0.7768404   1.255 0.209685    
## v03_paredMissing              NA         NA      NA       NA    
## p43_pago2             -0.0378334  0.2655563  -0.142 0.886724    
## p43_pago9                     NA         NA      NA       NA    
## v15_servsan2           0.1889236  0.2850345   0.663 0.507526    
## v15_servsan3                  NA         NA      NA       NA    
## v15_servsanMissing            NA         NA      NA       NA    
## p32_pueblo_per2        0.1846702  0.1772236   1.042 0.297525    
## p32_pueblo_per9        0.3671801  0.6590916   0.557 0.577519    
## v08_aguadist2         -0.2672088  0.2160463  -1.237 0.216296    
## v08_aguadist3         -0.0822314  0.3939978  -0.209 0.834695    
## v08_aguadistMissing           NA         NA      NA       NA    
## p52_mov2              -0.0214448  0.2420912  -0.089 0.929423    
## p52_mov3               0.1892796  0.3657375   0.518 0.604842    
## p52_mov4              -2.5862087  2.1604884  -1.197 0.231424    
## p52_mov9               0.6561976  0.4523655   1.451 0.147045    
## p52_movMissing        -0.0806859  0.4105565  -0.197 0.844216    
## v04_revoq2            -0.4707360  0.2215429  -2.125 0.033720 *  
## v04_revoqMissing              NA         NA      NA       NA    
## v17_tenencia2         -0.6237885  0.3977864  -1.568 0.116999    
## v17_tenencia3         -0.3879971  0.3616342  -1.073 0.283441    
## v17_tenencia4         -0.2712238  0.3614987  -0.750 0.453174    
## v17_tenencia5          1.1752844  0.8376042   1.403 0.160722    
## v17_tenencia6         -0.2275831  1.6141247  -0.141 0.887888    
## v17_tenencia7         -0.6972228  0.6221163  -1.121 0.262533    
## v17_tenencia8         -0.0001743  0.5874069   0.000 0.999763    
## v17_tenenciaMissing           NA         NA      NA       NA    
## v18c_auto2            -0.2297225  0.2063698  -1.113 0.265769    
## v18c_auto9                    NA         NA      NA       NA    
## v18c_autoMissing              NA         NA      NA       NA    
## v09_energia2           0.0584269  0.7276295   0.080 0.936008    
## v09_energia3           0.1503551  0.4353706   0.345 0.729866    
## v09_energia4          -0.0039199  0.8196860  -0.005 0.996185    
## v09_energia5          -0.2670975  0.2638030  -1.012 0.311422    
## v09_energiaMissing            NA         NA      NA       NA    
## v19d_celular2         -0.2760474  0.2508965  -1.100 0.271353    
## v19d_celular9         -0.1234754  3.6787317  -0.034 0.973228    
## v19d_celularMissing           NA         NA      NA       NA    
## v14_dormit1            0.3608706  0.4471501   0.807 0.419733    
## v14_dormit2            0.3365191  0.5028521   0.669 0.503429    
## v14_dormit3            0.0806014  0.5503906   0.146 0.883585    
## v14_dormit4            0.1154080  0.6434357   0.179 0.857671    
## v14_dormit5            0.9615351  0.8519542   1.129 0.259187    
## v14_dormit6           -0.5157838  1.1840459  -0.436 0.663164    
## v14_dormit7            1.3640393  2.2616124   0.603 0.546490    
## v14_dormit8           -1.1205601  1.8484363  -0.606 0.544435    
## v14_dormitMissing             NA         NA      NA       NA    
## v18i_aire2            -0.2890207  0.4779831  -0.605 0.545466    
## v18i_aire9                    NA         NA      NA       NA    
## v18i_aireMissing              NA         NA      NA       NA    
## v07_aguapro2          -0.5844105  0.4551512  -1.284 0.199288    
## v07_aguapro3           1.1948062  0.5332159   2.241 0.025147 *  
## v07_aguapro4          -0.2091616  0.3737204  -0.560 0.575762    
## v07_aguapro5           0.3498870  0.4454506   0.785 0.432269    
## v07_aguapro6           0.3687814  0.4085629   0.903 0.366827    
## v07_aguapro7           0.2763270  0.4356201   0.634 0.525935    
## v07_aguapro8          -0.9033876  0.7224930  -1.250 0.211303    
## v07_aguapro9           0.4640759  0.6815507   0.681 0.496003    
## v07_aguaproMissing            NA         NA      NA       NA    
## v05_techo2            -0.3292063  0.2207815  -1.491 0.136088    
## v05_techo3             0.0729969  0.3959786   0.184 0.853760    
## v05_techo4            -0.0425353  0.2776541  -0.153 0.878259    
## v05_techo5            -0.3987052  0.6550317  -0.609 0.542803    
## v05_techoMissing              NA         NA      NA       NA    
## v18a_bici2            -0.4681932  0.1779664  -2.631 0.008581 ** 
## v18a_bici9            -3.5237041  7.4000349  -0.476 0.634001    
## v18a_biciMissing              NA         NA      NA       NA    
## v18h_calefon2         -0.8530397  0.4848019  -1.760 0.078629 .  
## v18h_calefon9                 NA         NA      NA       NA    
## v18h_calefonMissing           NA         NA      NA       NA    
## tip_hog2              -0.0862737  0.3771540  -0.229 0.819086    
## tip_hog3              -0.0663085  0.3036041  -0.218 0.827135    
## tip_hog4              -0.2105191  0.2761963  -0.762 0.446022    
## tip_hog5               0.2451188  0.2590300   0.946 0.344108    
## tip_hog6              -0.2236652  0.4123510  -0.542 0.587591    
## tip_hog7               0.0474900  1.8252988   0.026 0.979246    
## tip_hogMissing                NA         NA      NA       NA    
## p37_lugres52           0.6001530  0.2696144   2.226 0.026123 *  
## p37_lugres53           0.6061429  0.8639428   0.702 0.483007    
## p37_lugres59           0.1516095  0.8439972   0.180 0.857458    
## p25_sexo2              0.6374010  0.1701282   3.747 0.000184 ***
## p30c_privad2           0.0064150  0.2218082   0.029 0.976930    
## p30c_privad9          -2.4268429  3.9583784  -0.613 0.539884    
## v13_habitac2          -0.4659830  0.2463923  -1.891 0.058734 .  
## v13_habitac3           0.0222914  0.3113937   0.072 0.942938    
## v13_habitac4          -0.1232138  0.3850329  -0.320 0.748994    
## v13_habitac5          -0.2919635  0.4973574  -0.587 0.557248    
## v13_habitac6           1.2394283  0.6443194   1.924 0.054538 .  
## v13_habitac7           0.1539542  0.8390673   0.183 0.854437    
## v13_habitac8           0.7395351  0.8350556   0.886 0.375929    
## v13_habitacMissing            NA         NA      NA       NA    
## v18b_moto2             0.1350868  0.1896116   0.712 0.476273    
## v18b_moto9                    NA         NA      NA       NA    
## v18b_motoMissing              NA         NA      NA       NA    
## v19a_radio2           -0.1787940  0.1711935  -1.044 0.296423    
## v19a_radio9            2.8176183  6.2539598   0.451 0.652373    
## v19a_radioMissing             NA         NA      NA       NA    
## v19h_telfijo2         -0.3588507  0.4478725  -0.801 0.423088    
## v19h_telfijo9         -0.7875086  5.1307856  -0.153 0.878029    
## v19h_telfijoMissing           NA         NA      NA       NA    
## p30d_atedom2           0.3972084  0.2597692   1.529 0.126396    
## p30d_atedom9                  NA         NA      NA       NA    
## p36_lugres2            0.3201246  0.4008118   0.799 0.424561    
## v20a_emi2              0.5480362  0.2744377   1.997 0.045962 *  
## v20a_emiMissing               NA         NA      NA       NA    
## p30f_autome2          -0.1795617  0.1637213  -1.097 0.272877    
## p30f_autome9                  NA         NA      NA       NA    
## p30e_tradic2           0.1567589  0.1721195   0.911 0.362529    
## p30e_tradic9                  NA         NA      NA       NA    
## p35_lugnac2           -0.1810258  0.1798978  -1.006 0.314403    
## p35_lugnac3            0.6683294  0.9246367   0.723 0.469883    
## p35_lugnac9            0.5102827  1.3209981   0.386 0.699325    
## p44_nego2                     NA         NA      NA       NA    
## p44_nego9                     NA         NA      NA       NA    
## p44_negoMissing               NA         NA      NA       NA    
## v18d_carreta2          0.0718181  0.3650621   0.197 0.844059    
## v18d_carreta9                 NA         NA      NA       NA    
## v18d_carretaMissing           NA         NA      NA       NA    
## p28_cn2               -0.9838794  1.2524833  -0.786 0.432225    
## p28_cn9               -0.6870736  2.7595043  -0.249 0.803397    
## p30a_public2          -0.1852839  0.2141764  -0.865 0.387084    
## p30a_public9           1.0773741  2.1579944   0.499 0.617658    
## v12_cocina2            0.5270537  0.2050186   2.571 0.010217 *  
## v12_cocinaMissing             NA         NA      NA       NA    
## v18e_bote2            -0.4763776  0.5142472  -0.926 0.354367    
## v18e_bote9             2.1666032  3.5973508   0.602 0.547055    
## v18e_boteMissing              NA         NA      NA       NA    
## v21a_fal2              0.2546112  0.2605269   0.977 0.328538    
## v21a_falMissing               NA         NA      NA       NA    
## p31_cobersalud2               NA         NA      NA       NA    
## p31_cobersaludMissing         NA         NA      NA       NA    
## pet_199               -1.1060303  0.5637385  -1.962 0.049901 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.467 on 2077 degrees of freedom
## Multiple R-squared:  0.5806, Adjusted R-squared:  0.5438 
## F-statistic:  15.8 on 182 and 2077 DF,  p-value: < 2.2e-16

Step-Wise-Regression: AIC

Backward

model_full <- lm(formula = formula,data = sample_candidates)
model_null <- lm(formula = aestudio ~ 1,data = sample_candidates)
# model_full %>% summary()

model_backward <- step(model_full,direction = "backward",trace = 0)
# model_backward %>% summary()
model_forward <- step(model_null,scope = formula, direction = "forward", trace = 0)
# model_forward %>% summary()
model_both <- step(model_null,scope = formula, direction = "both", trace = 0)
# model_both %>% summary()
model_backward$call %>% print() 
## lm(formula = aestudio ~ p26_edad + ocu_1d_13 + p40_lee + p53_ecivil + 
##     p42d_comuni + v06_piso + v16_desague + p31_afiliado + p30b_caja + 
##     v04_revoq + v18c_auto + v07_aguapro + v18a_bici + v18h_calefon + 
##     p37_lugres5 + p25_sexo + v20a_emi + v12_cocina + pet_19, 
##     data = sample_candidates)
model_forward$call %>% print() 
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v06_piso + p53_ecivil + p25_sexo + v04_revoq + 
##     v12_cocina + v18a_bici + p42d_comuni + p37_lugres5 + p30b_caja + 
##     pet_19 + v07_aguapro + v18c_auto + v18h_calefon + v20a_emi, 
##     data = sample_candidates)
model_both$call %>% print() 
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v06_piso + p53_ecivil + p25_sexo + v04_revoq + 
##     v12_cocina + v18a_bici + p42d_comuni + p37_lugres5 + p30b_caja + 
##     pet_19 + v07_aguapro + v18c_auto + v18h_calefon + v20a_emi, 
##     data = sample_candidates)

Step-Wise-Regression: BIC

model_backward_BIC <- step(model_full,direction = "backward",trace = 0, k = log(nrow(sample_candidates)))
# model_backward_BIC %>% summary()
model_forward_BIC <- step(model_null,scope = formula, direction = "forward",trace = 0, k = log(nrow(sample_candidates)))
# model_forward_BIC %>% summary()
model_both_BIC <- step(model_null,scope = formula, direction = "both", trace = 0,k =  log(nrow(sample_candidates)))
# model_both_BIC %>% summary()
model_backward_BIC$call %>% print() 
## lm(formula = aestudio ~ p26_edad + ocu_1d_13 + p40_lee + v16_desague + 
##     p30b_caja + v04_revoq + p25_sexo, data = sample_candidates)
model_forward_BIC$call %>% print() 
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v04_revoq + p25_sexo + v12_cocina, data = sample_candidates)
model_both_BIC$call %>% print() 
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v04_revoq + p25_sexo + v12_cocina, data = sample_candidates)

model_backward_BIC\(call %>% print() model_forward_BIC\)call %>% print() model_both_BIC$call %>% print()

model_backward_BIC %>% summary()
## 
## Call:
## lm(formula = aestudio ~ p26_edad + ocu_1d_13 + p40_lee + v16_desague + 
##     p30b_caja + v04_revoq + p25_sexo, data = sample_candidates)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -11.932  -2.581   0.293   2.256  17.511 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         24.025162   3.585235   6.701 2.61e-11 ***
## p26_edad            -0.102984   0.004832 -21.314  < 2e-16 ***
## ocu_1d_131          -2.959064   3.975656  -0.744 0.456776    
## ocu_1d_132          -1.889711   3.576398  -0.528 0.597286    
## ocu_1d_133          -4.836819   3.600891  -1.343 0.179334    
## ocu_1d_134          -5.611807   3.625485  -1.548 0.121794    
## ocu_1d_135          -7.815453   3.564550  -2.193 0.028443 *  
## ocu_1d_136          -9.521052   3.559799  -2.675 0.007536 ** 
## ocu_1d_137          -8.461714   3.564479  -2.374 0.017685 *  
## ocu_1d_138          -8.570574   3.571843  -2.399 0.016500 *  
## ocu_1d_139          -9.741265   3.589283  -2.714 0.006699 ** 
## ocu_1d_1397         -6.961707   3.605236  -1.931 0.053610 .  
## ocu_1d_1399        -10.090738   3.617356  -2.790 0.005323 ** 
## ocu_1d_13Missing    -8.262454   3.562049  -2.320 0.020453 *  
## p40_lee2            -5.811020   0.313822 -18.517  < 2e-16 ***
## p40_lee9            -7.321805   1.348128  -5.431 6.21e-08 ***
## v16_desague2         0.019360   0.326278   0.059 0.952690    
## v16_desague3        -1.404211   0.213205  -6.586 5.61e-11 ***
## v16_desague4        -1.200082   0.547878  -2.190 0.028597 *  
## v16_desague5        -2.246377   0.831129  -2.703 0.006928 ** 
## v16_desague6        -1.317273   0.750439  -1.755 0.079339 .  
## v16_desagueMissing  -1.259412   0.220337  -5.716 1.24e-08 ***
## p30b_caja2          -1.159032   0.233110  -4.972 7.13e-07 ***
## p30b_caja9          -0.549945   0.777060  -0.708 0.479190    
## v04_revoq2          -0.780262   0.169905  -4.592 4.63e-06 ***
## v04_revoqMissing     0.652346   0.515976   1.264 0.206257    
## p25_sexo2            0.602533   0.161599   3.729 0.000197 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.543 on 2233 degrees of freedom
## Multiple R-squared:  0.529,  Adjusted R-squared:  0.5235 
## F-statistic: 96.47 on 26 and 2233 DF,  p-value: < 2.2e-16
model_forward_BIC %>% summary()
## 
## Call:
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v04_revoq + p25_sexo + v12_cocina, data = sample_candidates)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9372  -2.5098   0.2672   2.2435  17.8286 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        22.718375   3.558611   6.384 2.09e-10 ***
## p40_lee2           -5.815999   0.312229 -18.627  < 2e-16 ***
## p40_lee9           -7.253293   1.341683  -5.406 7.13e-08 ***
## ocu_1d_131         -2.904840   3.958019  -0.734  0.46308    
## ocu_1d_132         -1.850918   3.559916  -0.520  0.60316    
## ocu_1d_133         -4.966381   3.584898  -1.385  0.16608    
## ocu_1d_134         -5.514007   3.608393  -1.528  0.12663    
## ocu_1d_135         -7.731044   3.548447  -2.179  0.02946 *  
## ocu_1d_136         -9.434455   3.543657  -2.662  0.00782 ** 
## ocu_1d_137         -8.408634   3.548310  -2.370  0.01788 *  
## ocu_1d_138         -8.556907   3.555635  -2.407  0.01618 *  
## ocu_1d_139         -9.669341   3.572871  -2.706  0.00686 ** 
## ocu_1d_1397        -6.866536   3.588738  -1.913  0.05583 .  
## ocu_1d_1399        -9.882508   3.601084  -2.744  0.00611 ** 
## ocu_1d_13Missing   -8.219553   3.545954  -2.318  0.02054 *  
## p26_edad           -0.102137   0.004801 -21.272  < 2e-16 ***
## v16_desague2        0.077416   0.324817   0.238  0.81164    
## v16_desague3       -1.411071   0.212120  -6.652 3.62e-11 ***
## v16_desague4       -1.197858   0.545128  -2.197  0.02810 *  
## v16_desague5       -2.088753   0.827902  -2.523  0.01171 *  
## v16_desague6       -1.397162   0.744947  -1.876  0.06085 .  
## v16_desagueMissing -1.382140   0.222034  -6.225 5.74e-10 ***
## p31_afiliado2       1.518594   0.288538   5.263 1.55e-07 ***
## p31_afiliado3       3.118287   0.988771   3.154  0.00163 ** 
## p31_afiliado4      -0.218776   0.204459  -1.070  0.28472    
## p31_afiliado9       0.691986   0.560125   1.235  0.21681    
## v04_revoq2         -0.833052   0.172271  -4.836 1.42e-06 ***
## v04_revoqMissing    0.791849   0.517508   1.530  0.12613    
## p25_sexo2           0.638829   0.161676   3.951 8.02e-05 ***
## v12_cocina2         0.508947   0.184122   2.764  0.00575 ** 
## v12_cocinaMissing         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.527 on 2230 degrees of freedom
## Multiple R-squared:  0.5339, Adjusted R-squared:  0.5279 
## F-statistic:  88.1 on 29 and 2230 DF,  p-value: < 2.2e-16
model_both_BIC %>% summary()
## 
## Call:
## lm(formula = aestudio ~ p40_lee + ocu_1d_13 + p26_edad + v16_desague + 
##     p31_afiliado + v04_revoq + p25_sexo + v12_cocina, data = sample_candidates)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -11.9372  -2.5098   0.2672   2.2435  17.8286 
## 
## Coefficients: (1 not defined because of singularities)
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        22.718375   3.558611   6.384 2.09e-10 ***
## p40_lee2           -5.815999   0.312229 -18.627  < 2e-16 ***
## p40_lee9           -7.253293   1.341683  -5.406 7.13e-08 ***
## ocu_1d_131         -2.904840   3.958019  -0.734  0.46308    
## ocu_1d_132         -1.850918   3.559916  -0.520  0.60316    
## ocu_1d_133         -4.966381   3.584898  -1.385  0.16608    
## ocu_1d_134         -5.514007   3.608393  -1.528  0.12663    
## ocu_1d_135         -7.731044   3.548447  -2.179  0.02946 *  
## ocu_1d_136         -9.434455   3.543657  -2.662  0.00782 ** 
## ocu_1d_137         -8.408634   3.548310  -2.370  0.01788 *  
## ocu_1d_138         -8.556907   3.555635  -2.407  0.01618 *  
## ocu_1d_139         -9.669341   3.572871  -2.706  0.00686 ** 
## ocu_1d_1397        -6.866536   3.588738  -1.913  0.05583 .  
## ocu_1d_1399        -9.882508   3.601084  -2.744  0.00611 ** 
## ocu_1d_13Missing   -8.219553   3.545954  -2.318  0.02054 *  
## p26_edad           -0.102137   0.004801 -21.272  < 2e-16 ***
## v16_desague2        0.077416   0.324817   0.238  0.81164    
## v16_desague3       -1.411071   0.212120  -6.652 3.62e-11 ***
## v16_desague4       -1.197858   0.545128  -2.197  0.02810 *  
## v16_desague5       -2.088753   0.827902  -2.523  0.01171 *  
## v16_desague6       -1.397162   0.744947  -1.876  0.06085 .  
## v16_desagueMissing -1.382140   0.222034  -6.225 5.74e-10 ***
## p31_afiliado2       1.518594   0.288538   5.263 1.55e-07 ***
## p31_afiliado3       3.118287   0.988771   3.154  0.00163 ** 
## p31_afiliado4      -0.218776   0.204459  -1.070  0.28472    
## p31_afiliado9       0.691986   0.560125   1.235  0.21681    
## v04_revoq2         -0.833052   0.172271  -4.836 1.42e-06 ***
## v04_revoqMissing    0.791849   0.517508   1.530  0.12613    
## p25_sexo2           0.638829   0.161676   3.951 8.02e-05 ***
## v12_cocina2         0.508947   0.184122   2.764  0.00575 ** 
## v12_cocinaMissing         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.527 on 2230 degrees of freedom
## Multiple R-squared:  0.5339, Adjusted R-squared:  0.5279 
## F-statistic:  88.1 on 29 and 2230 DF,  p-value: < 2.2e-16
## covariats for the AIC 
covariates <- names(model_both$coefficients)
covariates <- covariates[covariates != "(Intercept)"]
covariates %>% length()
## [1] 75
covariates
##  [1] "p40_lee2"            "p40_lee9"            "ocu_1d_131"         
##  [4] "ocu_1d_132"          "ocu_1d_133"          "ocu_1d_134"         
##  [7] "ocu_1d_135"          "ocu_1d_136"          "ocu_1d_137"         
## [10] "ocu_1d_138"          "ocu_1d_139"          "ocu_1d_1397"        
## [13] "ocu_1d_1399"         "ocu_1d_13Missing"    "p26_edad"           
## [16] "v16_desague2"        "v16_desague3"        "v16_desague4"       
## [19] "v16_desague5"        "v16_desague6"        "v16_desagueMissing" 
## [22] "p31_afiliado2"       "p31_afiliado3"       "p31_afiliado4"      
## [25] "p31_afiliado9"       "v06_piso2"           "v06_piso3"          
## [28] "v06_piso4"           "v06_piso5"           "v06_piso6"          
## [31] "v06_piso7"           "v06_piso8"           "v06_piso9"          
## [34] "v06_pisoMissing"     "p53_ecivil2"         "p53_ecivil3"        
## [37] "p53_ecivil4"         "p53_ecivil5"         "p53_ecivil6"        
## [40] "p53_ecivil9"         "p25_sexo2"           "v04_revoq2"         
## [43] "v04_revoqMissing"    "v12_cocina2"         "v12_cocinaMissing"  
## [46] "v18a_bici2"          "v18a_bici9"          "v18a_biciMissing"   
## [49] "p42d_comuni2"        "p42d_comuni3"        "p42d_comuni4"       
## [52] "p42d_comuni9"        "p37_lugres52"        "p37_lugres53"       
## [55] "p37_lugres59"        "p30b_caja2"          "p30b_caja9"         
## [58] "pet_199"             "v07_aguapro2"        "v07_aguapro3"       
## [61] "v07_aguapro4"        "v07_aguapro5"        "v07_aguapro6"       
## [64] "v07_aguapro7"        "v07_aguapro8"        "v07_aguapro9"       
## [67] "v07_aguaproMissing"  "v18c_auto2"          "v18c_auto9"         
## [70] "v18c_autoMissing"    "v18h_calefon2"       "v18h_calefon9"      
## [73] "v18h_calefonMissing" "v20a_emi2"           "v20a_emiMissing"
## covariats for the BIC 
covariates <- names(model_both_BIC$coefficients)
covariates <- covariates[covariates != "(Intercept)"]
covariates %>% length()
## [1] 30
covariates
##  [1] "p40_lee2"           "p40_lee9"           "ocu_1d_131"        
##  [4] "ocu_1d_132"         "ocu_1d_133"         "ocu_1d_134"        
##  [7] "ocu_1d_135"         "ocu_1d_136"         "ocu_1d_137"        
## [10] "ocu_1d_138"         "ocu_1d_139"         "ocu_1d_1397"       
## [13] "ocu_1d_1399"        "ocu_1d_13Missing"   "p26_edad"          
## [16] "v16_desague2"       "v16_desague3"       "v16_desague4"      
## [19] "v16_desague5"       "v16_desague6"       "v16_desagueMissing"
## [22] "p31_afiliado2"      "p31_afiliado3"      "p31_afiliado4"     
## [25] "p31_afiliado9"      "v04_revoq2"         "v04_revoqMissing"  
## [28] "p25_sexo2"          "v12_cocina2"        "v12_cocinaMissing"